An introductory resource for the tidyverse collection of R packages
# Install the packages this tutorial uses, but only when they are missing, so
# re-running the script does not reinstall them every time. gganimate is
# included because the animated plot at the end of the tutorial loads it.
for (pkg in c("tidyverse", "gapminder", "gganimate")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(tidyverse)
library(gapminder)
# Import functions, listed here without arguments
read_csv() # comma-separated values, as exported from Excel/spreadsheets
read_delim() # any delimiter
# Other useful packages
readxl::read_excel() # By Jenny Bryan
We will work with the Gapminder dataset by Hans Rosling
Hans Rosling’s TED talks: https://www.ted.com/playlists/474/the_best_hans_rosling_talks_yo
# The data set is the `gapminder` object from the gapminder package
# (fully qualified: gapminder::gapminder)
str(gapminder) # Structure of the dataframe
Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
$ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
$ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
$ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
$ lifeExp : num 28.8 30.3 32 34 36.1 ...
$ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
$ gdpPercap: num 779 821 853 836 740 ...
gapminder # Data is in a cleaned-up 'tibble' format by default
# A tibble: 1,704 x 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
7 Afghanistan Asia 1982 39.9 12881816 978.
8 Afghanistan Asia 1987 40.8 13867957 852.
9 Afghanistan Asia 1992 41.7 16317921 649.
10 Afghanistan Asia 1997 41.8 22227415 635.
# ... with 1,694 more rows
# Show the first six observations (rows) of the data frame
head(gapminder, n = 6)
# A tibble: 6 x 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1952 28.8 8425333 779.
2 Afghanistan Asia 1957 30.3 9240934 821.
3 Afghanistan Asia 1962 32.0 10267083 853.
4 Afghanistan Asia 1967 34.0 11537966 836.
5 Afghanistan Asia 1972 36.1 13079460 740.
6 Afghanistan Asia 1977 38.4 14880372 786.
# Info-dense summary of the data: one line per column, showing its type and
# the first few values
glimpse(gapminder)
Observations: 1,704
Variables: 6
$ country <fct> Afghanistan, Afghanistan, Afghanistan, Afghanis...
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia,...
$ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987,...
$ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438,...
$ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460,...
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.981...
# View data in a visual GUI-based spreadsheet-like format.
# View() needs an interactive session, so skip it when the script is run
# non-interactively (e.g. via Rscript or when knitting a document).
if (interactive()) {
  View(gapminder)
}
# Reshaping verbs (listed without arguments, for reference)
gather() # Gather COLUMNS -> ROWS (superseded; prefer pivot_longer())
spread() # Spread ROWS -> COLUMNS (superseded; prefer pivot_wider())
separate() # Separate 1 COLUMN -> many COLUMNS
unite() # Unite several COLUMNS -> 1 COLUMN
# Row/column manipulation verbs (listed without arguments, for reference)
filter() # PICK observations by their values | ROWS
select() # PICK variables by their names | COLUMNS
mutate() # CREATE new variables w/ functions of existing variables | COLUMNS
transmute() # COMPUTE 1 or more COLUMNS but drop original columns
arrange() # REORDER the ROWS
summarise() # COLLAPSE many values to a single SUMMARY
group_by() # GROUP data into rows with the same value of variable (COLUMN)
# Structure of the dataframe: column names and types
str(gapminder)
Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
$ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
$ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
$ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
$ lifeExp : num 28.8 30.3 32 34 36.1 ...
$ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
$ gdpPercap: num 779 821 853 836 740 ...
# Keep only the observations (rows) recorded in the year 1962
filter(gapminder, year == 1962)
# A tibble: 142 x 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1962 32.0 10267083 853.
2 Albania Europe 1962 64.8 1728137 2313.
3 Algeria Africa 1962 48.3 11000948 2551.
4 Angola Africa 1962 34 4826015 4269.
5 Argentina Americas 1962 65.1 21283783 7133.
6 Australia Oceania 1962 70.9 10794968 12217.
7 Austria Europe 1962 69.5 7129864 10751.
8 Bahrain Asia 1962 56.9 171863 12753.
9 Bangladesh Asia 1962 41.2 56839289 686.
10 Belgium Europe 1962 70.2 9218400 10991.
# ... with 132 more rows
# The same filter written with the pipe ('then') operator %>%, which passes
# the left-hand side as the first argument of the next call
gapminder %>%
  filter(year == 1962)
# A tibble: 142 x 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Afghanistan Asia 1962 32.0 10267083 853.
2 Albania Europe 1962 64.8 1728137 2313.
3 Algeria Africa 1962 48.3 11000948 2551.
4 Angola Africa 1962 34 4826015 4269.
5 Argentina Americas 1962 65.1 21283783 7133.
6 Australia Oceania 1962 70.9 10794968 12217.
7 Austria Europe 1962 69.5 7129864 10751.
8 Bahrain Asia 1962 56.9 171863 12753.
9 Bangladesh Asia 1962 41.2 56839289 686.
10 Belgium Europe 1962 70.2 9218400 10991.
# ... with 132 more rows
# Arrange/Sort by Life Expectancy
arrange(gapminder, lifeExp) # ascending order
# A tibble: 1,704 x 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Rwanda Africa 1992 23.6 7290203 737.
2 Afghanistan Asia 1952 28.8 8425333 779.
3 Gambia Africa 1952 30 284320 485.
4 Angola Africa 1952 30.0 4232095 3521.
5 Sierra Leone Africa 1952 30.3 2143249 880.
6 Afghanistan Asia 1957 30.3 9240934 821.
7 Cambodia Asia 1977 31.2 6978607 525.
8 Mozambique Africa 1952 31.3 6446316 469.
9 Sierra Leone Africa 1957 31.6 2295678 1004.
10 Burkina Faso Africa 1952 32.0 4469979 543.
# ... with 1,694 more rows
# desc() sorts in descending order and, unlike a unary minus, also works on
# non-numeric columns
arrange(gapminder, desc(lifeExp)) # descending order
# A tibble: 1,704 x 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Japan Asia 2007 82.6 127467972 31656.
2 Hong Kong, China Asia 2007 82.2 6980412 39725.
3 Japan Asia 2002 82 127065841 28605.
4 Iceland Europe 2007 81.8 301931 36181.
5 Switzerland Europe 2007 81.7 7554661 37506.
6 Hong Kong, China Asia 2002 81.5 6762476 30209.
7 Australia Oceania 2007 81.2 20434176 34435.
8 Spain Europe 2007 80.9 40448191 28821.
9 Sweden Europe 2007 80.9 9031088 33860.
10 Israel Asia 2007 80.7 6426679 25523.
# ... with 1,694 more rows
# The ascending sort again, this time chained with the pipe ('then') operator
gapminder %>%
  arrange(lifeExp)
# A tibble: 1,704 x 6
country continent year lifeExp pop gdpPercap
<fct> <fct> <int> <dbl> <int> <dbl>
1 Rwanda Africa 1992 23.6 7290203 737.
2 Afghanistan Asia 1952 28.8 8425333 779.
3 Gambia Africa 1952 30 284320 485.
4 Angola Africa 1952 30.0 4232095 3521.
5 Sierra Leone Africa 1952 30.3 2143249 880.
6 Afghanistan Asia 1957 30.3 9240934 821.
7 Cambodia Asia 1977 31.2 6978607 525.
8 Mozambique Africa 1952 31.3 6446316 469.
9 Sierra Leone Africa 1957 31.6 2295678 1004.
10 Burkina Faso Africa 1952 32.0 4469979 543.
# ... with 1,694 more rows
library(tidyverse)
library(gapminder)

# Bubble chart of life expectancy against GDP per capita (log10 x-axis), with
# one panel per continent. Point size is mapped to population and colour to
# country, using the `country_colors` palette. The plot is built once and
# stored so that the static and animated versions share a single definition.
continent_plot <- ggplot(
  gapminder,
  aes(gdpPercap, lifeExp, size = pop, colour = country)
) +
  geom_point(alpha = 0.7, show.legend = FALSE) +
  scale_colour_manual(values = country_colors) +
  scale_size(range = c(2, 12)) +
  scale_x_log10() +
  theme_minimal() +
  facet_wrap(~continent)

# Static version
continent_plot

# Animated version: one frame sequence over `year`
library(gganimate)
continent_plot +
  # Here comes the gganimate specific bits
  labs(
    title = "Year: {frame_time}",
    x = "GDP per capita",
    y = "life expectancy"
  ) +
  transition_time(year) +
  ease_aes("linear")